Author: Dan Wexler
Date: 2024-04-23

This script helps the user identify correlated predictor variables and allows them to choose the variables on which the random forest model will initially be trained. Below is a list of the inputs and outputs.

INPUTS: (in ‘1_training_input’ folder)
1) .csv file generated by Google Earth Engine and ArcGIS Pro containing predictor variable information for disturbance polygons. (UPLOAD)
OUTPUTS: (in ‘2_starting_variables’ folder)
2) List of predictor variables.

(1) Setup

Here we load the R libraries that are used throughout the script. Some or all of them may have to be downloaded.

# loads R libraries
library(DT)
library(dplyr)
library(here)
library(ggcorrplot)
library(ggplot2)
library(plotly)

This block of code contains variables that the user can modify to fit their specific needs. This is the only block of code that requires user input, barring more involved modification of the script. The in-line comments detail specifically what each variable represents.

IMPORTANT: This script is meant to be run multiple times. The first time, it is recommended that all variables are included (set to ‘T’ below). After looking at the categorical correlations, remove variables (set to ‘F’ below) that will reduce the number of correlated pairs and run the script again to see what has changed. The variables that are set to ‘T’ are the ones that will be included in the training script, so before moving on to training ensure that the desired variables are set to ‘T’ and run this script one final time. Ensure that the park variable is set correctly.

# the name of the park (MORA, OLYM, or NOCA)
park <- "MORA"
# the name of the file containing the predictor variables
predictors_file <- "predictors.csv"
# the correlation cutoff for pairs of variables
correlation_cutoff <- 0.85
# if TRUE, filters out patches in the elevation/vegetation mask
filter_out_mask <- FALSE
# if TRUE, filters out patches from the year 1987
drop_1987 <- TRUE
# ---------------------------------------------------------------------------- #
# THE NEXT FOUR VARIABLES CONTROL WHICH PATCHES ARE INCLUDED IN THE VARIABLE
# SELECTION PROCESS AND WHICH ARE FILTERED OUT. SET ONE (AND ONLY ONE) TO TRUE. 
# IF ALL FOUR ARE SET TO FALSE, THE ENTIRE STUDY AREA WILL BE USED.
# ---------------------------------------------------------------------------- #
# if TRUE, filters patches to those within the protected areas and buffer
model_pa_and_buffer <- TRUE
# if TRUE, filters patches to those within the park and buffer
model_park_and_buffer <- FALSE
# if TRUE, filters patches to those within the protected areas
model_pa <- FALSE
# if TRUE, filters patches to those within the park
model_park <- FALSE
# ---------------------------------------------------------------------------- #
# spectral predictor variables (set to 'T' or 'F')
spec <- c("idxMagMn" = T,
          "idxMagSd" = T,
          "tcbMagMn" = T,
          "tcbMagSd" = T,
          "tcbPreMn" = T,
          "tcbPreSd" = T,
          "tcbPst01Mn" = T,
          "tcbPst01Sd" = T,
          "tcbPst03Mn" = T,
          "tcbPst03Sd" = T,
          "tcbPst07Mn" = T,
          "tcbPst07Sd" = T,
          "tcbPst15Mn" = T,
          "tcbPst15Sd" = T,
          "tcbPstMn" = T,
          "tcbPstSd" = T,
          "tcgMagMn" = T,
          "tcgMagSd" = T,
          "tcgPreMn" = T,
          "tcgPreSd" = T,
          "tcgPst01Mn" = T,
          "tcgPst01Sd" = T,
          "tcgPst03Mn" = T,
          "tcgPst03Sd" = T,
          "tcgPst07Mn" = T,
          "tcgPst07Sd" = T,
          "tcgPst15Mn" = T,
          "tcgPst15Sd" = T,
          "tcgPstMn" = T,
          "tcgPstSd" = T,
          "tcwMagMn" = T,
          "tcwMagSd" = T,
          "tcwPreMn" = T,
          "tcwPreSd" = T,
          "tcwPst01Mn" = T,
          "tcwPst01Sd" = T,
          "tcwPst03Mn" = T,
          "tcwPst03Sd" = T,
          "tcwPst07Mn" = T,
          "tcwPst07Sd" = T,
          "tcwPst15Mn" = T,
          "tcwPst15Sd" = T,
          "tcwPstMn" = T,
          "tcwPstSd" = T)
# landscape predictor variables (set to 'T' or 'F')
land <- c("elev_max" = T,
          "elev_mean" = T,
          "elev_min" = T,
          "slope_max" = T,
          "slope_mean" = T,
          "slope_min" = T,
          "dist_stream_mean" = T,
          "dist_stream_std" = T,
          "eastness" = T,
          "northness" = T,
          "tpi500_mean" = T,
          "tpi500_std" = T,
          "tpi2000_mean" = T,
          "tpi2000_std" = T,
          "tci_mean" = T,
          "tci_std" = T)
# geometric predictor variables (set to 'T' or 'F')
geom <- c("MajorAxis" = T,
          "MinorAxis" = T,
          "Orientatio" = T,
          "area" = T,
          "perim" = T,
          "Thickness" = T,
          "paratio" = T)
# space and time predictor variables (set to 'T' or 'F')
spti <- c("yod" = T,
          "durMn" = T,
          "durSd" = T,
          "latitude" = T,
          "longitude" = T)

This block of code contains a function for displaying tables that will be used throughout the script.

# function for displaying tables
create_table <- function(table, text, rows) {
  # finds number of highly correlated variables and adds number to title
  num_pairs <- nrow(table[table$`Above Cutoff?` == "yes",])
  title <- paste0(park, " | ", text, " (number of correlated pairs: ", num_pairs, ")")
  # displays the table
  DT::datatable(table, class = "cell-border stripe hover", 
                rownames = rows, caption = title,
                options = list(dom = "ftp", order = list(classes = "no-sort")))
}

This block of code contains a function for creating a table with correlation information between pairs of variables.

# function for creating correlation table
correlation_table <- function(correlation) {
  # gets the list and number of variables
  vars <- rownames(correlation)
  num_vars <- length(vars)
  # creates a new data frame to store results
  correlated_vars <- data.frame()
  # loops through the upper triangle of the correlation matrix
  for (i in 1:(num_vars - 1)) {
    for (j in (i + 1):num_vars) {
      # if the correlation between two variables is above the threshold,
      # sets the 'above_cutoff' flag to 'yes'
      above_cutoff <- "no"
      if (abs(correlation[i, j]) >= correlation_cutoff) {
        above_cutoff <- "yes"
      }
      # adds a row to the results data frame
      corr <- correlation[i, j]
      correlated_vars <- rbind(correlated_vars, c(vars[i], vars[j], round(corr, 2), above_cutoff, abs(corr)))
    }
  }
  # formats and returns results data frame
  colnames(correlated_vars) <- c("Variable 1", "Variable 2", "Correlation", "Above Cutoff?", "Sort")
  correlated_vars$Sort <- as.numeric(correlated_vars$Sort)
  correlated_vars <- correlated_vars[order(correlated_vars$Sort, decreasing = TRUE),]
  return (correlated_vars)
}

This block of code contains a function that formats and displays a correlation matrix for a certain set of predictor variables.

# sets plot size
fig_size <- 8.5
# function for displaying correlation matrix
correlation_plot <- function(correlation, title) {
  # gets the number of variables
  num_vars <- nrow(correlation)
  # reverses rows of correlation matrix for readability
  correlation <- correlation[,num_vars:1]
  # creates vector that will label correlated squares with 'x'
  label <- c(ifelse(abs(correlation) >= correlation_cutoff & correlation != 1, "x", ""))
  # plots correlation matrix
  plot <- ggcorrplot(correlation) + labs(title = title) +
    geom_text(aes(label = label), color = "white") +
    theme(plot.title = element_text(hjust = 0.5, size = 12),
          axis.text.x = element_text(size = 10, angle = 65), 
          axis.text.y = element_text(size = 10))
  ggplotly(plot, tooltip = c("value", "Var1", "Var2")) %>% config(displayModeBar = FALSE)
}

Here we load the predictor variables and subset them by the predictors that the user has chosen to include.

# loads the predictor variables
x <- data.frame(read.csv(here(park, "1_training_input", predictors_file)))
# filters out patches within the elevation mask 
if (filter_out_mask) {
  x <- x[x$In_Mask != "Yes",]
}
# filters out patches from the year 1987
if (drop_1987) {
  x <- x[x$yod != 1987,]
}
# filters patches based on location in the study area
if (model_pa_and_buffer) {
  x <- x[which(x$Protected == "Yes" | x$In_Buffer == "Yes"),]
} else if (model_park_and_buffer) {
  x <- x[which(x$In_Park == "Yes" | x$In_Buffer == "Yes"),]
} else if (model_pa) {
  x <- x[x$Protected == "Yes",]
} else if (model_park) {
  x <- x[x$In_Park == "Yes",]
}
# assembles a list of all predictors
predictors <- c(spec, land, geom, spti)
# filters data by chosen predictors
selected_vars <- names(predictors)[unlist(predictors)]
x <- subset(x, select = selected_vars)
# converts predictor data to numeric and changes NA values to 0
x <- data.frame(sapply(x, as.numeric))
x[is.na(x)] <- 0

(2) Categorical Correlation

Spectral

Use the correlation table and matrix below to determine which variables to keep and which to remove. This block of code displays a table with correlation information between spectral variables.

# gets list of spectral variables and creates correlation matrix
selected_vars <- names(spec)[unlist(spec)]
if (length(selected_vars) > 1) {
  spec_matrix <- cor(subset(x, select = selected_vars), method = "spearman")
  # creates and displays table
  spec_correlated <- correlation_table(spec_matrix)
  create_table(spec_correlated[,1:4], "Spectral Variable Correlations", FALSE)
}

This block of code displays a correlation matrix for spectral variables. Zoom in on an area by clicking and dragging, and reset the view to default by double clicking the plot.

# creates correlation matrix
if (length(selected_vars) > 1) {
  correlation_plot(spec_matrix, paste(park, "Spectral Correlation Matrix"))
}

Landscape

Use the correlation table and matrix below to determine which variables to keep and which to remove. This block of code displays a table with correlation information between landscape variables.

# gets list of landscape variables and creates correlation matrix
selected_vars <- names(land)[unlist(land)]
if (length(selected_vars) > 1) {
  land_matrix <- cor(subset(x, select = selected_vars), method = "spearman")
  # creates and displays table
  land_correlated <- correlation_table(land_matrix)
  create_table(land_correlated[,1:4], "Landscape Variable Correlations", FALSE)
}

This block of code displays a correlation matrix for landscape variables. Zoom in on an area by clicking and dragging, and reset the view to default by double clicking the plot.

# creates correlation matrix
if (length(selected_vars) > 1) {
  correlation_plot(land_matrix, paste(park, "Landscape Correlation Matrix"))
}

Geometric

Use the correlation table and matrix below to determine which variables to keep and which to remove. This block of code displays a table with correlation information between geometric variables.

# gets a list of geometric variables and creates correlation matrix
selected_vars <- names(geom)[unlist(geom)]
if (length(selected_vars) > 1) {
  geom_matrix <- cor(subset(x, select = selected_vars), method = "spearman")
  # creates and displays table
  geom_correlated <- correlation_table(geom_matrix)
  create_table(geom_correlated[,1:4], "Geometric Variable Correlations", FALSE)
}

This block of code displays a correlation matrix for geometric variables. Zoom in on an area by clicking and dragging, and reset the view to default by double clicking the plot.

# creates correlation matrix
if (length(selected_vars) > 1) {
  correlation_plot(geom_matrix, paste(park, "Geometric Correlation Matrix"))
}

Space and Time

Use the correlation table and matrix below to determine which variables to keep and which to remove. This block of code displays a table with correlation information between space and time variables.

# gets a list of space and time variables and creates correlation matrix
selected_vars <- names(spti)[unlist(spti)]
if (length(selected_vars) > 1) {
  spti_matrix <- cor(subset(x, select = selected_vars), method = "spearman")
  # creates and displays table
  spti_correlated <- correlation_table(spti_matrix)
  create_table(spti_correlated[,1:4], "Correlated Space and Time Variables", FALSE)
}

This block of code displays a correlation matrix for space and time variables. Zoom in on an area by clicking and dragging, and reset the view to default by double clicking the plot.

# creates correlation matrix
if (length(selected_vars) > 1) {
  correlation_plot(spti_matrix, paste(park, "Space and Time Correlation Matrix"))
}

(3) Final Correlation

This block of code displays a table containing correlation information between pairs of variables, regardless of category.

# creates correlation matrix for all variables
totl_matrix <- cor(x, method = "spearman")
totl_correlated <- correlation_table(totl_matrix)
# displays table
create_table(totl_correlated[,1:4], "All Variable Correlations", FALSE)

(4) Saving Variables

Saves the variables that are set to ‘T’. These are there variables that the training script will start modeling with, so be sure to check which variables are currently set to ‘T’ before running this script for a final time.

# saves the variables the user has selected
to_save <- data.frame(names(predictors)[unlist(predictors)])
colnames(to_save) <- "vars"
file_name <- paste0("starting_variables_", park, ".csv")
write.csv(to_save, file = here(park, "2_starting_variables", file_name), row.names = FALSE)